In [1]:
import numpy as np
import pandas as pd
import calendar
import datetime
import os

# Visualisation libraries

## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output

## plotly
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px

## seaborn
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})

## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
from pylab import rcParams
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")
Instacart Market Basket Analysis Dataset

In this article, we go analyze the Instacart Market Basket Analysis Dataset from Kaggle. You can download the data from Kaggle.com or from instacart.com. The data includes the following files

  • aisles.csv
  • departments.csv
  • order_products__prior.csv
  • order_products__train.csv
  • orders.csv
  • products.csv
  • sample_submission.csv

Data Description

The dataset for this competition is a relational set of files describing customers' orders over time. The goal of the competition is to predict which products will be in a user's next order. The dataset is anonymized and contains a sample of over 3 million grocery orders from more than 200,000 Instacart users. For each user, we provide between 4 and 100 of their orders, with the sequence of products purchased in each order. We also provide the week and hour of the day the order was placed and a relative measure of time between orders. For more information, see the blog post accompanying its public release.

Data Dictionary

The data dictionary is available here.

orders (3.4m rows, 206k users):

  • order_id: order identifier
  • user_id: customer identifier
  • eval_set: which evaluation set this order belongs in (see SET described below)
  • order_number: the order sequence number for this user (1 = first, n = nth)
  • order_dow: the day of the week the order was placed on
  • order_hour_of_day: the hour of the day the order was placed on
  • days_since_prior: days since the last order, capped at 30 (with NAs for order_number = 1)

products (50k rows):

  • product_id: product identifier
  • product_name: name of the product
  • aisle_id: foreign key
  • department_id: foreign key

aisles (134 rows):

  • aisle_id: aisle identifier
  • aisle: the name of the aisle

deptartments (21 rows):

  • department_id: department identifier
  • department: the name of the department

order_products__SET (30m+ rows):

  • order_id: foreign key
  • product_id: foreign key
  • add_to_cart_order: order in which each product was added to cart
  • reordered: 1 if this product has been ordered by this user in the past, 0 otherwise

where SET is one of the four following evaluation sets (eval_set in orders):

  • "prior": orders prior to that users most recent order (~3.2m orders)
  • "train": training data supplied to participants (~131k orders)
  • "test": test data reserved for machine learning competitions (~75k orders)
In [2]:
def Header(Text, L = 100, C1 = Back.BLUE, C2 = Fore.BLUE):
    print(C1 + Fore.WHITE + Style.NORMAL + Text + Style.RESET_ALL + ' ' + C2 +
          Style.NORMAL +  (L- len(Text) - 1)*'=' + Style.RESET_ALL)
def Line(L=100, C = Fore.BLUE): print(C + Style.NORMAL + L*'=' + Style.RESET_ALL)

PATH = 'Instacart'
Files = os.listdir(PATH)
Files.remove('sample_submission.csv')
Files_Info = pd.DataFrame()
for i in range(len(Files)):
    Header(Files[i])
    Temp = pd.read_csv(os.path.join(PATH, Files[i]))
    display(Temp.head(5))
    Files_Info = Files_Info.append(pd.DataFrame({'File':[Files[i]], 'Number of Instances': [Temp.shape[0]],
                                                 'Number of Attributes': [Temp.shape[1]]}), ignore_index = True)
    filename = Files[i].split('.')[0]
    globals() [filename] = pd.read_csv(os.path.join(PATH, Files[i]))
Line()
display(Files_Info.style.hide_index())
Line()
del Files, i, Temp
aisles.csv =========================================================================================
aisle_id aisle
0 1 prepared soups salads
1 2 specialty cheeses
2 3 energy granola bars
3 4 instant foods
4 5 marinades meat preparation
departments.csv ====================================================================================
department_id department
0 1 frozen
1 2 other
2 3 bakery
3 4 produce
4 5 alcohol
orders.csv =========================================================================================
order_id user_id eval_set order_number order_dow order_hour_of_day days_since_prior_order
0 2539329 1 prior 1 2 8 NaN
1 2398795 1 prior 2 3 7 15.0
2 473747 1 prior 3 3 12 21.0
3 2254736 1 prior 4 4 7 29.0
4 431534 1 prior 5 4 15 28.0
order_products_prior.csv ===========================================================================
order_id product_id add_to_cart_order reordered
0 2 33120 1 1
1 2 28985 2 1
2 2 9327 3 0
3 2 45918 4 1
4 2 30035 5 0
order_products_train.csv ===========================================================================
order_id product_id add_to_cart_order reordered
0 1 49302 1 1
1 1 11109 2 1
2 1 10246 3 0
3 1 49683 4 0
4 1 43633 5 1
products.csv =======================================================================================
product_id product_name aisle_id department_id
0 1 Chocolate Sandwich Cookies 61 19
1 2 All-Seasons Salt 104 13
2 3 Robust Golden Unsweetened Oolong Tea 94 7
3 4 Smart Ones Classic Favorites Mini Rigatoni Wit... 38 1
4 5 Green Chile Anytime Sauce 5 13
====================================================================================================
File Number of Instances Number of Attributes
aisles.csv 134 2
departments.csv 21 2
orders.csv 3421083 7
order_products_prior.csv 32434489 4
order_products_train.csv 1384617 4
products.csv 49688 4
====================================================================================================
Exploratory Data Analysis

Most Ordered Products

In [3]:
fig, ax = plt.subplots(nrows=3, ncols=1, figsize=(14, 6*3))
## The most ordered Products
Top_number = 15
Header('The Most Ordered Products', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Most_Ordered_Products = order_products_prior.groupby(['product_id'])['product_id'].agg({'count'})
Most_Ordered_Products.columns = ['Count']
Most_Ordered_Products = Most_Ordered_Products.reset_index(drop = False).sort_values(by='Count', ascending=False)
Most_Ordered_Products = pd.merge(Most_Ordered_Products, products, on='product_id', how='right')
Most_Ordered_Products = Most_Ordered_Products.reset_index(drop=True).dropna()
Most_Ordered_Products['Count'] = Most_Ordered_Products['Count'].astype(int)
display(Most_Ordered_Products.head(Top_number))

_ = sns.barplot(ax = ax[0], y='product_name', x='Count', palette='PuBu',
                edgecolor='SkyBlue',  hatch="///", data=Most_Ordered_Products.head(Top_number))
_ = sns.barplot(ax = ax[0], y='product_name', x='Count', facecolor = 'None',
                edgecolor='k',data=Most_Ordered_Products.head(Top_number))
_ = ax[0].set_title('Top %s Ordered Products' % Top_number)
_ = ax[0].set_xlim([0, 5e5])
_ = ax[0].set_ylabel('Products')
_ = ax[0].set_xlabel('Count')

## The most Common Order Size
Top_number = 30
Header('The Most Frequent Order Size', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Most_Ordered = order_products_prior.groupby(['order_id'])['product_id'].agg({'count'})
Most_Ordered.columns = ['Number of Products in an Order']
Most_Ordered = Most_Ordered.reset_index(drop = False).\
               groupby(['Number of Products in an Order'])['Number of Products in an Order'].agg({'count'})
Most_Ordered.columns = ['Count']
Most_Ordered = Most_Ordered.reset_index(drop = False).sort_values(by='Count', ascending=False)
display(Most_Ordered.head(Top_number).set_index('Number of Products in an Order').T)

_ = sns.barplot(ax = ax[1], x='Number of Products in an Order', y='Count', palette='OrRd',
                edgecolor='DarkOrange',  hatch="///", data= Most_Ordered.head(Top_number))
_ = sns.barplot(ax = ax[1], x='Number of Products in an Order', y='Count', facecolor = 'None',
                edgecolor='k',  data= Most_Ordered.head(Top_number))

_ = ax[1].set_title('%s Most Frequent Order Size' % Top_number)
_ = ax[1].set_ylim([0, 2.5e5])

## The Most Reordered Products
Top_number = 15
Header('The Most Reordered Products', L = 100, C1 = Back.GREEN, C2 = Fore.GREEN)
Temp = order_products_prior.loc[order_products_prior.reordered ==1]
Most_ReOrdered_Products = Temp.groupby(['product_id'])['product_id'].agg({'count'})
Most_ReOrdered_Products.columns = ['Count']
Most_ReOrdered_Products = Most_ReOrdered_Products.reset_index(drop = False).sort_values(by='Count', ascending=False)
Most_ReOrdered_Products = pd.merge(Most_ReOrdered_Products, products, on='product_id', how='right')
Most_ReOrdered_Products = Most_ReOrdered_Products.reset_index(drop=True).dropna()
Most_ReOrdered_Products['Count'] = Most_ReOrdered_Products['Count'].astype(int)

display(Most_ReOrdered_Products.head(Top_number))

_ = sns.barplot(ax = ax[2], y='product_name', x='Count', palette='PiYG',
                edgecolor='SkyBlue',  hatch="///", data=Most_ReOrdered_Products.head(Top_number))
_ = sns.barplot(ax = ax[2], y='product_name', x='Count', facecolor = 'None',
                edgecolor='k',data=Most_ReOrdered_Products.head(Top_number))
_ = ax[2].set_title('Top %s Reordered Products' % Top_number)
_ = ax[2].set_xlim([0, 4.5e5])
_ = ax[2].set_ylabel('Products')
_ = ax[2].set_xlabel('Count')
The Most Ordered Products ==========================================================================
product_id Count product_name aisle_id department_id
0 24852 472565 Banana 24 4
1 13176 379450 Bag of Organic Bananas 24 4
2 21137 264683 Organic Strawberries 24 4
3 21903 241921 Organic Baby Spinach 123 4
4 47209 213584 Organic Hass Avocado 24 4
5 47766 176815 Organic Avocado 24 4
6 47626 152657 Large Lemon 24 4
7 16797 142951 Strawberries 24 4
8 26209 140627 Limes 24 4
9 27845 137905 Organic Whole Milk 84 16
10 27966 137057 Organic Raspberries 123 4
11 22935 113426 Organic Yellow Onion 83 4
12 24964 109778 Organic Garlic 83 4
13 45007 104823 Organic Zucchini 83 4
14 39275 100060 Organic Blueberries 123 4
The Most Frequent Order Size =======================================================================
Number of Products in an Order 5 6 4 7 3 8 2 9 10 1 ... 21 22 23 24 25 26 27 28 29 30
Count 228330 227675 222081 220006 207027 203374 186993 184347 165550 156748 ... 41863 36368 31672 27065 23613 20283 17488 15102 13033 11251

1 rows × 30 columns

The Most Reordered Products ========================================================================
product_id Count product_name aisle_id department_id
0 24852 398609 Banana 24 4
1 13176 315913 Bag of Organic Bananas 24 4
2 21137 205845 Organic Strawberries 24 4
3 21903 186884 Organic Baby Spinach 123 4
4 47209 170131 Organic Hass Avocado 24 4
5 47766 134044 Organic Avocado 24 4
6 27845 114510 Organic Whole Milk 84 16
7 47626 106255 Large Lemon 24 4
8 27966 105409 Organic Raspberries 123 4
9 16797 99802 Strawberries 24 4
10 26209 95768 Limes 24 4
11 22935 79072 Organic Yellow Onion 83 4
12 24964 74663 Organic Garlic 83 4
13 45007 72165 Organic Zucchini 83 4
14 49683 67313 Cucumber Kirby 83 4

Orders Distributions

In [4]:
Header('Orders Distributions (Week)', L = 100)
Orders_Distributions_Week = orders.groupby(['order_dow'])['order_dow'].agg({'count'})
Orders_Distributions_Week.index = calendar.day_name[0:7]
Orders_Distributions_Week.reset_index(inplace = True, drop = False)
Orders_Distributions_Week.columns =['Day of Week', 'Count']
display(Orders_Distributions_Week.set_index('Day of Week').T)

Header('Orders Distributions (Hours)', L = 100)
Orders_Distributions_Hours = orders.groupby(['order_hour_of_day'])['order_hour_of_day'].agg({'count'}).reset_index(drop = False)
Orders_Distributions_Hours.columns = ['Hour of Day', 'Count']
display(Orders_Distributions_Hours.set_index('Hour of Day').T)
        
fig = plt.figure(figsize=(13, 6), constrained_layout=True)
ax = np.zeros(2, dtype = 'object')
gs = fig.add_gridspec(1, 3)
ax[0] = fig.add_subplot(gs[0])
ax[1] = fig.add_subplot(gs[1:])

# Left Plot
_ = sns.barplot(ax = ax[0], y='Count', x='Day of Week', palette='summer',
                edgecolor='LimeGreen',  hatch="O", data= Orders_Distributions_Week)
_ = sns.barplot(ax = ax[0], y='Count', x='Day of Week', facecolor = 'None',
                edgecolor='k',data= Orders_Distributions_Week)
_ = ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=90)
_ = ax[0].set_ylim([0, 7e5])

# Right Plot
_ = sns.barplot(ax = ax[1], y='Count', x='Hour of Day', palette='cool',
                edgecolor='RoyalBlue',  hatch="*", data= Orders_Distributions_Hours)
_ = sns.barplot(ax = ax[1], y='Count', x='Hour of Day', facecolor = 'None',
                edgecolor='k',data= Orders_Distributions_Hours)
_ = ax[1].set_ylim([0, 3e5])

fig.tight_layout()
Orders Distributions (Week) ========================================================================
Day of Week Monday Tuesday Wednesday Thursday Friday Saturday Sunday
Count 600905 587478 467260 436972 426339 453368 448761
Orders Distributions (Hours) =======================================================================
Hour of Day 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
Count 22758 12398 7539 5474 5527 9569 30529 91868 178201 257812 ... 283042 283639 272553 228795 182912 140569 104292 78109 61468 40043

1 rows × 24 columns

Most orders take place on Mondays and Tuesdays. Besides, between 9:00 AM and 5:00 PM, most orders take place daily.

In [5]:
Orders_Distributions_Total = (orders.pivot_table(index='order_dow', columns='order_hour_of_day',
                                                 values='order_id', aggfunc='count').fillna(0)/1000).round(2)
Orders_Distributions_Total.index=calendar.day_name[0:7]
Orders_Distributions_Total.T.index.name = 'Hour of Day'
display(Orders_Distributions_Total)
fig, ax = plt.subplots(figsize=(15,6))
_ = sns.heatmap(Orders_Distributions_Total, annot=True, cmap =sns.color_palette("RdYlGn", n_colors=10),
                  annot_kws={"size": 12}, linewidths=.5, ax=ax, vmin=0, vmax=60,
                  cbar_kws={ 'label': 'Number of Orders $\\times$ $10^3$', "aspect":30, "shrink": .95}) 
_ = ax.set_ylabel('Week Days')
_ = ax.set_xlabel('Hour of the day')

fig.tight_layout()
Hour of Day 0 1 2 3 4 5 6 7 8 9 ... 14 15 16 17 18 19 20 21 22 23
Monday 3.94 2.40 1.41 0.96 0.81 1.17 3.33 12.41 28.11 40.80 ... 54.55 53.95 49.46 39.75 29.57 22.65 18.28 14.42 11.25 6.89
Tuesday 3.67 1.83 1.10 0.75 0.81 1.61 5.37 16.57 34.12 51.91 ... 46.76 46.40 44.76 36.79 28.98 22.14 16.28 11.94 8.99 5.62
Wednesday 3.06 1.57 0.94 0.72 0.74 1.40 4.76 13.24 24.64 36.31 ... 37.17 37.47 37.54 32.15 26.47 20.08 15.04 10.65 8.15 5.36
Thursday 2.95 1.50 0.95 0.65 0.72 1.36 4.56 12.40 22.55 32.31 ... 34.77 35.99 35.27 30.37 25.00 19.25 13.80 10.28 8.24 5.18
Friday 2.64 1.51 0.90 0.69 0.73 1.33 4.40 12.49 21.81 31.41 ... 33.62 34.22 34.09 29.38 24.42 19.35 14.19 10.80 8.81 5.64
Saturday 3.19 1.67 1.02 0.84 0.91 1.57 4.87 13.43 24.02 34.23 ... 37.41 37.51 35.86 29.96 24.31 18.74 13.32 9.52 7.50 5.26
Sunday 3.31 1.92 1.21 0.86 0.80 1.14 3.24 11.32 22.96 30.84 ... 38.75 38.09 35.56 30.40 24.16 18.35 13.39 10.50 8.53 6.09

7 rows × 24 columns

Between 9:00 AM and 5:00 PM, most orders take place on Mondays and Tuesdays.

Days after the First Order

In [6]:
Days_since_prior_order_summary = pd.DataFrame(orders.days_since_prior_order.value_counts().sort_index())
Days_since_prior_order_summary = Days_since_prior_order_summary.rename(columns={'days_since_prior_order':'Count'})
Days_since_prior_order_summary.index = np.array(Days_since_prior_order_summary.index,dtype=int)

fig = plt.figure(figsize=(13, 6), constrained_layout=True)
ax = np.zeros(2, dtype = 'object')
gs = fig.add_gridspec(1, 3)
ax[0] = fig.add_subplot(gs[0])
ax[1] = fig.add_subplot(gs[1:])

_ = ax[0].boxplot(orders.days_since_prior_order.dropna(), 0, 'rs', 0)
_ = ax[0].set_xlabel('Days since prior order')
_ = ax[0].set_title('Distribution of The Number\nof Days Since Prior Order')
_ = ax[0].set_xlim([0, 20])

# # Right Plot
_ = sns.barplot(ax = ax[1], y='Count', x='index', palette='summer',
                edgecolor='DarkGreen',  hatch="..", data= Days_since_prior_order_summary.reset_index())
_ = sns.barplot(ax = ax[1], y='Count', x='index', facecolor = 'None',
                edgecolor='k', data= Days_since_prior_order_summary.reset_index())
_ = ax[1].set_ylim([0, 4e5])
_ = ax[1].set_xlabel('Days since prior order')
_ = ax[1].set_title('Distribution of The Number\nof Days Since Prior Order')

fig.tight_layout()

Usually, customers order from 5 days to 15 days after the prior order. The majority of orders take place after a week and a month after the prior order.

Products Distributions

In [7]:
Products_Detailed  = pd.merge(left =pd.merge(left=products, right= departments, how='left'), right=aisles, how='left')
Products_Detailed.columns = [x.replace('_',' ').title().replace('Id','ID') for x in Products_Detailed.columns.tolist()]
Products_Detailed['Department'] = Products_Detailed['Department'].map(lambda x: x.title())
Products_Detailed['Aisle'] = Products_Detailed['Aisle'].map(lambda x: x.title())
Products_Detailed.head(10).style.hide_index()
Out[7]:
Product ID Product Name Aisle ID Department ID Department Aisle
1 Chocolate Sandwich Cookies 61 19 Snacks Cookies Cakes
2 All-Seasons Salt 104 13 Pantry Spices Seasonings
3 Robust Golden Unsweetened Oolong Tea 94 7 Beverages Tea
4 Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce 38 1 Frozen Frozen Meals
5 Green Chile Anytime Sauce 5 13 Pantry Marinades Meat Preparation
6 Dry Nose Oil 11 11 Personal Care Cold Flu Allergy
7 Pure Coconut Water With Orange 98 7 Beverages Juice Nectars
8 Cut Russet Potatoes Steam N' Mash 116 1 Frozen Frozen Produce
9 Light Strawberry Blueberry Yogurt 120 16 Dairy Eggs Yogurt
10 Sparkling Orange Juice & Prickly Pear Beverage 115 7 Beverages Water Seltzer Sparkling Water

Products Distributions in Each Department

In [8]:
Products_by_Department = pd.DataFrame(Products_Detailed.groupby('Department')['Product ID'].count())
Products_by_Department.columns = ['Total Products']
Products_by_Department = Products_by_Department.reset_index()
Products_by_Department['Percentage'] = np.round(100* Products_by_Department['Total Products']\
                                                / Products_by_Department['Total Products'].sum(),2)
Products_by_Department.sort_values(by='Total Products', ascending=False, inplace=True)

Colors = ['LightGreen']
LC = 'DarkGreen'
fig = px.bar(Products_by_Department, x = 'Department', y='Total Products', text = 'Percentage',
                   color_discrete_sequence= Colors,
                   hover_data= Products_by_Department.columns)
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 7e3])
fig.update_layout(title={'text': 'Total Products in Each Departments', 'x':0.5, 'y':0.94,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

It can be seen that Personal Care and Snacks are the best seller departments.

Products Distributions in Each Aisle

In [9]:
def Search_List(Key, List): return [s for s in List if Key in s]
myColors = list(mcolors.CSS4_COLORS.keys())
Temp = []
Temp.extend(Search_List('gray', Colors))
Temp.extend(Search_List('grey', Colors))
Temp.extend(Search_List('black', Colors))
myColors = list(set(Colors)-set(Temp))
In [10]:
Products_by_Aisle = pd.DataFrame(Products_Detailed.groupby(["Department", "Aisle"])["Product ID"].count())
Products_by_Aisle.columns = ['Total Products']
Products_by_Aisle = Products_by_Aisle.reset_index()
Products_by_Aisle['Percentage'] = np.round(100* Products_by_Aisle['Total Products']\
                                                / Products_by_Aisle['Total Products'].sum(),2)
Products_by_Aisle = Products_by_Aisle.sort_values(by='Total Products', ascending=False)

Products_by_Aisle = Products_by_Aisle.loc[(Products_by_Aisle['Department'] != 'Missing') &
                                          (Products_by_Aisle['Aisle'] != 'Missing') &
                                          (Products_by_Aisle['Aisle'] != 'Other')]
for d in Products_by_Aisle['Department'].unique().tolist():
    Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Percentage'] = \
    np.round(100* Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products']\
    /Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products'].sum(),2)

# Plot
font = FontProperties()
font.set_weight('bold')
    
Temp = list(np.linspace(0, 100, 6))
m = Products_by_Aisle['Percentage'].max()
for j in range(len(Temp)-1):
    if Temp[j] <= m < Temp[j+1]:
        m = Temp[j+1]
        break
        
N = len(Products_by_Aisle.Department.unique())
fpr = 4; C = 0; Ind = list(np.arange(0, N, fpr))
fig, ax = plt.subplots(nrows=int(np.ceil(N/fpr)), ncols=fpr, figsize=(16,30), sharey = True,
                       gridspec_kw =  dict(hspace=1.4, wspace=.0))
ax = ax.ravel()
_ = fig.delaxes(ax[-1])
Palette = ['Purples', 'Blues', 'Greens', 'Oranges']*5
EdgeColor = ['Indigo', 'Navy', 'ForestGreen','Salmon']*5
Hatch = ['//', 'O', '--', '\\']*5
for (aisle, PBA), ax in zip(Products_by_Aisle.groupby(['Department']), ax.flatten()):
    _ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, palette= Palette[C], edgecolor=EdgeColor[C], hatch= Hatch[C])
    _ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, facecolor = 'None', edgecolor='k')
    if C in Ind:
        _ = ax.set(xlabel = 'Aisles', ylabel='Percentage', ylim = [0, 100])
    else:
        _ = ax.set(xlabel = 'Aisles', ylabel=None, ylim = [0, m])
    _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    _ = ax.set_title(aisle)
    C +=1
    
_ = fig.suptitle(t = 'Product Distributions', y = .91,
                 fontproperties=font, fontsize = 16)


##
def Search_List(Key, List): return [s for s in List if Key in s]
myColors = list(mcolors.CSS4_COLORS.keys())
Temp = []
Temp.extend(Search_List('gray', Colors))
Temp.extend(Search_List('grey', Colors))
Temp.extend(Search_List('black', Colors))
myColors = list(set(Colors)-set(Temp))
Colors = myColors.copy()
LC = 'Black'
fig = px.bar(Products_by_Aisle, x = 'Aisle', y='Total Products', color = 'Department',
             text = 'Total Products', color_discrete_sequence= Colors, hover_data= ['Department', 'Total Products'])
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 1.4e3])
fig.update_layout(title={'text': 'Total Products in Each Aisles', 'x':0.5, 'y':0.94,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Candy chocolate, chips pretzels, and cookies cakes are the best selling products.

Sale Distributions

In [11]:
Orders_Detailed = orders[['user_id', 'order_id']].merge(order_products_train[['order_id', 'product_id']],
                                                        how='inner', left_on='order_id', right_on='order_id')
Orders_Detailed.columns = [x.replace('_',' ').title().replace('Id','ID') for x in Orders_Detailed.columns.tolist()]
Orders_Detailed = Orders_Detailed.merge(Products_Detailed, how='inner', left_on='Product ID', right_on='Product ID')
Orders_Detailed.head(10).style.hide_index()
Out[11]:
User ID Order ID Product ID Product Name Aisle ID Department ID Department Aisle
1 1187899 196 Soda 77 7 Beverages Soft Drinks
67 2757217 196 Soda 77 7 Beverages Soft Drinks
676 632715 196 Soda 77 7 Beverages Soft Drinks
760 1167274 196 Soda 77 7 Beverages Soft Drinks
804 3347074 196 Soda 77 7 Beverages Soft Drinks
865 692135 196 Soda 77 7 Beverages Soft Drinks
992 3289231 196 Soda 77 7 Beverages Soft Drinks
1306 1269882 196 Soda 77 7 Beverages Soft Drinks
1512 1558696 196 Soda 77 7 Beverages Soft Drinks
1550 2005577 196 Soda 77 7 Beverages Soft Drinks

Best Selling Departments

In [12]:
Orders_by_Department = pd.DataFrame(Orders_Detailed.groupby('Department')['Order ID'].count())
Orders_by_Department.columns = ['Total Orders']
Orders_by_Department = Orders_by_Department.reset_index()
Orders_by_Department['Percentage'] = np.round(100* Orders_by_Department['Total Orders']\
                                              / Orders_by_Department['Total Orders'].sum(),2)

Colors = ['MediumVioletRed']
LC = 'Indigo'
fig = px.bar(Orders_by_Department, x = 'Department', y= 'Percentage', text = 'Total Orders', color_discrete_sequence= Colors,
             hover_data = Orders_by_Department.columns)
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 30])
fig.update_layout(title={'text': 'Best Selling Departments', 'x':0.5, 'y':0.94,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()

Best Selling Aisles

In [13]:
Products_by_Aisle = pd.DataFrame(Orders_Detailed.groupby(["Department", "Aisle"])["Product ID"].count())
Products_by_Aisle.columns = ['Total Products']
Products_by_Aisle = Products_by_Aisle.reset_index()
Products_by_Aisle['Percentage'] = np.round(100* Products_by_Aisle['Total Products']\
                                                / Products_by_Aisle['Total Products'].sum(),2)
Products_by_Aisle = Products_by_Aisle.sort_values(by='Total Products', ascending=False)
Products_by_Aisle.head().style.hide_index()

Products_by_Aisle = Products_by_Aisle.loc[(Products_by_Aisle['Department'] != 'Missing') &
                                          (Products_by_Aisle['Aisle'] != 'Missing') &
                                          (Products_by_Aisle['Aisle'] != 'Other')]
for d in Products_by_Aisle['Department'].unique().tolist():
    Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Percentage'] = \
    np.round(100* Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products']\
    /Products_by_Aisle.loc[Products_by_Aisle['Department'] == d , 'Total Products'].sum(),2)

# Plot
font = FontProperties()
font.set_weight('bold')
    
Temp = list(np.linspace(0, 100, 6))
m = Products_by_Aisle['Percentage'].max()
for j in range(len(Temp)-1):
    if Temp[j] <= m < Temp[j+1]:
        m = Temp[j+1]
        break
        
N = len(Products_by_Aisle.Department.unique())
fpr = 4; C = 0;
Ind = list(np.arange(0, N, fpr))
fig, ax = plt.subplots(nrows=int(np.ceil(N/fpr)), ncols=fpr, figsize=(16,30), sharey = True,
                       gridspec_kw =  dict(hspace=1.4, wspace=.0))
ax = ax.ravel()
_ = fig.delaxes(ax[-1])
Palette = ['Purples', 'Blues', 'Greens', 'Oranges']*5
EdgeColor = ['Indigo', 'Navy', 'ForestGreen','Salmon']*5
Hatch = ['//', 'O', '--', '\\']*5
for (aisle, PBA), ax in zip(Products_by_Aisle.groupby(['Department']), ax.flatten()):
    _ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, palette= Palette[C], edgecolor=EdgeColor[C], hatch= Hatch[C])
    _ = sns.barplot(PBA['Aisle'], PBA['Percentage'] , ax=ax, facecolor = 'None', edgecolor='k')
    if C in Ind:
        _ = ax.set(xlabel = 'Aisles', ylabel='Percentage', ylim = [0, 100])
    else:
        _ = ax.set(xlabel = 'Aisles', ylabel=None, ylim = [0, m])
    _ = ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    _ = ax.set_title(aisle)
    C +=1
    
_ = fig.suptitle(t = 'Product Sale Distributions', y = .91,
                 fontproperties=font, fontsize = 16)


##
Colors = myColors.copy()
LC = 'Black'
fig = px.bar(Products_by_Aisle, x = 'Aisle', y='Total Products', color = 'Department',
             text = 'Total Products', color_discrete_sequence= Colors,
                   hover_data= ['Department', 'Total Products'])
fig.update_layout(plot_bgcolor= 'white', barmode='stack')
fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                 showgrid=True, gridwidth=1, gridcolor='Lightgray', range=[0, 2e5])
fig.update_layout(title={'text': 'Best Selling Aisles', 'x':0.5, 'y':0.94,
                         'xanchor': 'center', 'yanchor': 'top'})
fig.show()